# Author: Stephen Situ
# In this project, we build several models for an NLP (Natural Language Processing) multiclass classification task: Multinomial Naive Bayes,
# a simple dense NN, an LSTM NN, a GRU NN, a Bidirectional LSTM NN, a Conv1D NN, and a TensorFlow Hub sentence encoder model. To preprocess the data, we create
# a text_vectorizer object and trainable embedding layers for our neural networks. Afterwards, we investigate various
# classification metrics: accuracy, recall, precision, and F1-score. We found that the LSTM, Bidirectional, and GRU models were the best,
# then combined them into an ensemble model that outperformed each of them individually. We also investigated the speed/score tradeoff
# and found that the simple dense model performed best on that metric. Finally, we extracted the weights and vocabulary of the LSTM embedding layer
# and visualized the word embeddings with the TensorFlow Projector (https://projector.tensorflow.org/).
# The original tweet data can be found here: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification
# Import Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
# Read csv
df_1 = pd.read_csv("Corona_NLP_test.csv",encoding='iso-8859-1')
df_2 = pd.read_csv("Corona_NLP_train.csv",encoding='iso-8859-1')
df_3 = pd.concat([df_1, df_2])
# Drop duplicates
df_3 = df_3.drop_duplicates()
# Drop N/A's
df_3 = df_3.dropna(subset=['OriginalTweet', 'Sentiment'])
# Find unique values
unique_values = df_3['Sentiment'].unique()
unique_values
array(['Extremely Negative', 'Positive', 'Extremely Positive', 'Negative', 'Neutral'], dtype=object)
# Define function to replace labels with integers
def replace_sentiment(sentiment):
    if sentiment == 'Extremely Negative':
        return 0
    elif sentiment == 'Negative':
        return 1
    elif sentiment == 'Neutral':
        return 2
    elif sentiment == 'Positive':
        return 3
    elif sentiment == 'Extremely Positive':
        return 4
    else:
        return sentiment
# Use .apply method to replace
df_3['Sentiment'] = df_3['Sentiment'].apply(replace_sentiment)
# Find unique values
unique_values = df_3['Sentiment'].unique()
unique_values
array([0, 3, 4, 1, 2], dtype=int64)
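# Note: an equivalent (and arguably more idiomatic) way to do this mapping is a dict plus .map,
# shown here only as a commented-out sketch -- unlike the function above, unknown labels would
# become NaN instead of passing through unchanged:
# sentiment_to_int = {'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2,
#                     'Positive': 3, 'Extremely Positive': 4}
# df_3['Sentiment'] = df_3['Sentiment'].map(sentiment_to_int)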
# Check data types
df_3.dtypes
UserName          int64
ScreenName        int64
Location         object
TweetAt          object
OriginalTweet    object
Sentiment         int64
dtype: object
df_3
|       | UserName | ScreenName | Location                     | TweetAt    | OriginalTweet                                     | Sentiment |
|-------|----------|------------|------------------------------|------------|---------------------------------------------------|-----------|
| 0     | 1        | 44953      | NYC                          | 02-03-2020 | TRENDING: New Yorkers encounter empty supermar... | 0         |
| 1     | 2        | 44954      | Seattle, WA                  | 02-03-2020 | When I couldn't find hand sanitizer at Fred Me... | 3         |
| 2     | 3        | 44955      | NaN                          | 02-03-2020 | Find out how you can protect yourself and love... | 4         |
| 3     | 4        | 44956      | Chicagoland                  | 02-03-2020 | #Panic buying hits #NewYork City as anxious sh... | 1         |
| 4     | 5        | 44957      | Melbourne, Victoria          | 03-03-2020 | #toiletpaper #dunnypaper #coronavirus #coronav... | 2         |
| ...   | ...      | ...        | ...                          | ...        | ...                                               | ...       |
| 41152 | 44951    | 89903      | Wellington City, New Zealand | 14-04-2020 | Airline pilots offering to stock supermarket s... | 2         |
| 41153 | 44952    | 89904      | NaN                          | 14-04-2020 | Response to complaint not provided citing COVI... | 0         |
| 41154 | 44953    | 89905      | NaN                          | 14-04-2020 | You know itÂs getting tough when @KameronWild... | 3         |
| 41155 | 44954    | 89906      | NaN                          | 14-04-2020 | Is it wrong that the smell of hand sanitizer i... | 2         |
| 41156 | 44955    | 89907      | i love you so much || he/him | 14-04-2020 | @TartiiCat Well new/used Rift S are going for ... | 1         |
44955 rows × 6 columns
# Train/test split with an 80/20 split
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df_3, test_size=0.2)
train_data_y = train_data["Sentiment"]
train_data_x = train_data["OriginalTweet"]
test_data_y = test_data["Sentiment"]
test_data_x = test_data["OriginalTweet"]
test_data_y
24448    1
2470     0
6796     1
22922    3
181      2
        ..
8426     0
2371     4
26589    2
26452    2
39898    4
Name: Sentiment, Length: 8991, dtype: int64
# Default TextVectorization usage to create a text vectorizer (kept commented out for reference)
# import tensorflow as tf
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing";
# you can use "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more.
# Use the default TextVectorization arguments:
# text_vectorizer = TextVectorization(max_tokens=None,  # how many words in the vocabulary (all of the different words in your text)
#                                     standardize="lower_and_strip_punctuation",  # how to process text
#                                     split="whitespace",  # how to split tokens
#                                     ngrams=None,  # create groups of n-words?
#                                     output_mode="int",  # how to map tokens to numbers
#                                     output_sequence_length=None)  # how long should the output sequence of tokens be?
#                                     # pad_to_max_tokens=True)  # not valid if using max_tokens=None
# Find the average number of tokens (words) per Tweet (computed here on the test split)
round(sum([len(i.split()) for i in test_data_x])/len(test_data_x))
31
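# Optional quick check (a sketch, not part of the original run): besides the mean, the length
# distribution of the training Tweets can guide the choice of output_sequence_length,
# e.g. the 95th percentile covers most Tweets.
import numpy as np
token_counts = [len(tweet.split()) for tweet in train_data_x]
print(f"Mean tokens per Tweet: {np.mean(token_counts):.1f}")
print(f"95th percentile: {np.percentile(token_counts, 95):.0f}")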
# Set up text vectorization with custom variables.
# For max_tokens (the number of words in the vocabulary), multiples of 10,000 (10,000, 20,000, 30,000)
# or the exact number of unique words in your text (e.g. 32,179) are common values.
# max_length is the max length our sequences will be (e.g. how many words from a Tweet does our model see?)
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
max_vocab_length = 10000
max_length = 31
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
output_mode="int",
output_sequence_length=max_length)
# Use .adapt method on training data
text_vectorizer.adapt(train_data_x)
# Test the text vectorizer
sample_sentence = "I hate covid a lot and it sucks"
text_vectorizer([sample_sentence])
<tf.Tensor: shape=(1, 31), dtype=int64, numpy= array([[ 15, 2198, 58, 7, 358, 4, 30, 2893, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)>
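# Optional sketch (not in the original run): map the token ids back to words with get_vocabulary()
# to sanity-check the vectorizer; id 0 is padding and id 1 is the [UNK] token.
vocab_lookup = text_vectorizer.get_vocabulary()
print([vocab_lookup[i] for i in text_vectorizer([sample_sentence])[0].numpy() if i != 0])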
# Read a random tweet from training data and vectorize it
import random
random_sentence = random.choice(train_data_x.to_list())  # .to_list() so random.choice picks by position, not by index label
print(f"Original text:\n{random_sentence}\n\nVectorized version:")
text_vectorizer([random_sentence])
Original text: someone close to my family tested positive for COVID 19 all she did was go to the grocery store Stay your ass home bruh like stop being fucking stupid Who tf cares if you wanna be out in the streets Vectorized version:
<tf.Tensor: shape=(1, 31), dtype=int64, numpy= array([[ 354, 317, 3, 37, 273, 577, 436, 10, 58, 52, 33, 263, 321, 74, 76, 3, 2, 23, 20, 115, 34, 1393, 78, 1, 70, 142, 122, 785, 1230, 64, 5812]], dtype=int64)>
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")
Number of words in vocab: 10000 Top 5 most common words: ['', '[UNK]', 'the', 'to', 'and'] Bottom 5 least common words: ['heri', 'hereÂ\x94', 'herbs', 'herbal', 'hemantsorenjmm']
# Create an embedding layer for neural network
tf.random.set_seed(42)
from tensorflow.keras import layers
embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
output_dim=128, # set size of embedding vector
embeddings_initializer="uniform", # default, intialize randomly
input_length=max_length, # how long is each input
name="embedding_1")
embedding
<keras.layers.core.embedding.Embedding at 0x26195c9ed00>
# Get a random sentence from training set
random_sentence = random.choice(train_data_x.to_list())
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")
# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed,sample_embed[0][0]
Original text: In January I tweeted that Trump Trade Wars and economic sanctions caused food shortages, panic and desperation which directly contributed to the creation of the #coronavirus and his ultimate control of Supply Chain. This was deliberate, systemic, resulting in deadly consequences Embedded version:
(<tf.Tensor: shape=(1, 31, 128), dtype=float32, numpy= array([[[ 0.00257028, -0.04243337, -0.02343434, ..., -0.00133356, 0.04663134, 0.00035417], [-0.01242752, 0.02142335, -0.02424029, ..., 0.04241223, 0.0263625 , -0.04868319], [ 0.04266793, 0.03783042, -0.0123207 , ..., -0.02365079, -0.00954125, -0.01112708], ..., [-0.00710202, 0.03660338, 0.04749844, ..., -0.03742009, -0.04001515, 0.04663609], [ 0.03848192, -0.04844777, 0.0345685 , ..., -0.01402212, -0.04769395, -0.0404261 ], [-0.03521683, -0.00981399, -0.00284491, ..., -0.04792733, -0.01664252, -0.03293632]]], dtype=float32)>, <tf.Tensor: shape=(128,), dtype=float32, numpy= array([ 2.5702827e-03, -4.2433370e-02, -2.3434341e-02, 8.0344789e-03, 1.0745153e-03, -1.7929029e-02, 2.8977264e-02, -1.3977997e-03, -1.5810572e-02, -3.0726958e-02, 3.0961167e-02, -1.0513291e-03, 4.6770964e-02, -2.5705492e-02, 4.7481630e-02, 1.6134176e-02, 2.7894724e-02, -9.3294494e-03, 2.3151565e-02, -3.6803614e-02, -3.5249867e-02, -2.4860298e-02, 1.2418628e-03, 3.7282575e-02, -2.2311080e-02, -9.5168836e-03, -2.3742771e-02, 2.5782909e-02, 3.8015518e-02, -3.2022476e-02, -4.2527914e-03, -8.2745180e-03, -7.3838979e-05, -9.4988942e-03, 6.7463890e-03, -4.8547637e-02, -1.4423311e-02, -2.5712121e-02, 4.5580339e-02, -1.8875754e-02, 1.9473795e-02, -2.5681913e-02, -2.6389455e-02, -3.4608819e-02, 4.3781772e-03, -4.2988770e-03, -1.9616604e-02, -3.3366576e-02, -3.6651753e-02, 2.1555666e-02, -3.0836686e-03, 3.6140036e-02, 6.3757673e-03, 7.7042356e-03, -4.4482496e-02, 1.4548246e-02, -1.8304668e-02, 7.4930303e-03, 1.5066180e-02, 3.6475848e-02, 1.1626970e-02, -2.6960596e-03, -1.1266552e-02, -1.1290491e-02, 1.8606279e-02, -4.7292411e-02, -1.5093494e-02, -1.6912926e-02, -3.4358669e-02, -4.4665921e-02, 1.9346658e-02, -1.4779925e-02, -4.2759862e-02, 4.9276356e-02, 8.7143108e-04, -3.3007562e-02, -4.9956825e-02, 3.6116470e-02, -4.0328968e-02, -2.1484721e-02, -3.8409606e-03, 1.7727111e-02, 2.5041923e-03, -1.8877871e-03, 2.1625366e-02, -1.3571978e-02, 8.3489306e-03, 4.6845045e-02, 4.7346506e-02, -3.7503481e-02, 2.5321994e-02, -2.0060575e-02, -1.3522793e-02, 4.5044795e-03, 2.3088697e-02, -2.6669383e-02, -3.6229409e-02, -4.2654883e-02, -2.8200543e-02, -7.8269616e-03, 4.2770315e-02, 4.2281751e-02, 1.0402847e-02, 1.9751079e-03, -4.3409895e-02, -2.8616060e-02, -4.6483517e-02, -4.7296502e-02, -7.2064623e-03, -4.0106405e-02, -1.0915063e-02, 2.9840302e-02, -2.2052431e-02, 3.7582148e-02, -2.5108470e-02, 4.0385414e-02, 4.2068195e-02, -5.6435950e-03, 2.9582690e-02, 1.3326693e-02, 3.9552812e-02, 7.6060779e-03, -2.6535273e-02, 4.8560772e-02, 1.7586496e-02, -1.3335571e-03, 4.6631340e-02, 3.5417080e-04], dtype=float32)>)
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def calculate_results(y_true, y_pred):
    """
    Calculates accuracy, precision, recall and f1-score of a classification model.
    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels in the form of a 1D array
    Returns a dictionary of accuracy, precision, recall, f1-score.
    """
    # Calculate model accuracy
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and f1 score using "weighted" average
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {"accuracy": model_accuracy,
                     "precision": model_precision,
                     "recall": model_recall,
                     "f1": model_f1}
    return model_results
# Use a Multinomial Naive Bayes model as a baseline.
# TfidfVectorizer() converts the text data into numerical features using
# Term Frequency-Inverse Document Frequency (TF-IDF) weighting.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Create tokenization and modelling pipeline
model_0 = Pipeline([
("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
("clf", MultinomialNB()) # model the text
])
# Fit the pipeline to the training data
model_0.fit(train_data_x, train_data_y)
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
# Check results
model_0_results = calculate_results(y_true=test_data_y, y_pred=model_0.predict(test_data_x))
model_0_results
{'accuracy': 35.50216883550217, 'precision': 0.6135211640206998, 'recall': 0.3550216883550217, 'f1': 0.24722328062848883}
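# Optional sketch (not part of the original run): a per-class breakdown can show where the gap
# between the baseline's weighted precision and its much lower recall/F1 comes from.
from sklearn.metrics import classification_report
print(classification_report(test_data_y, model_0.predict(test_data_x)))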
# Use a simple dense TensorFlow model
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(5, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")
model_1.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.keras.optimizers.Adam(),
metrics=["accuracy"])
history_1 = model_1.fit(train_data_x, # input sentences can be a list of strings due to text preprocessing layer built-in model
train_data_y,
epochs=5,
validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 15s 13ms/step - loss: 1.3669 - accuracy: 0.4093 - val_loss: 1.1985 - val_accuracy: 0.4977
Epoch 2/5
1124/1124 [==============================] - 14s 12ms/step - loss: 1.0428 - accuracy: 0.5981 - val_loss: 1.0484 - val_accuracy: 0.5781
Epoch 3/5
1124/1124 [==============================] - 14s 12ms/step - loss: 0.8619 - accuracy: 0.6927 - val_loss: 1.0119 - val_accuracy: 0.6086
Epoch 4/5
1124/1124 [==============================] - 14s 13ms/step - loss: 0.7496 - accuracy: 0.7428 - val_loss: 1.0160 - val_accuracy: 0.6165
Epoch 5/5
1124/1124 [==============================] - 15s 13ms/step - loss: 0.6721 - accuracy: 0.7755 - val_loss: 1.0519 - val_accuracy: 0.6151
# Use an LSTM model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
output_dim=128,
embeddings_initializer="uniform",
input_length=max_length,
name="embedding_2")
# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(5, activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")
model_2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.keras.optimizers.Adam(),
metrics=["accuracy"])
history_2 = model_2.fit(train_data_x, # input sentences can be a list of strings due to text preprocessing layer built-in model
train_data_y,
epochs=5,
validation_data=(test_data_x, test_data_y))
(None, 31, 128)
(None, 64)
Epoch 1/5
1124/1124 [==============================] - 29s 24ms/step - loss: 1.1258 - accuracy: 0.5354 - val_loss: 0.8976 - val_accuracy: 0.6538
Epoch 2/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.7640 - accuracy: 0.7199 - val_loss: 0.8219 - val_accuracy: 0.6944
Epoch 3/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.6431 - accuracy: 0.7736 - val_loss: 0.8477 - val_accuracy: 0.6867
Epoch 4/5
1124/1124 [==============================] - 26s 23ms/step - loss: 0.5376 - accuracy: 0.8151 - val_loss: 0.9264 - val_accuracy: 0.6751
Epoch 5/5
1124/1124 [==============================] - 26s 23ms/step - loss: 0.4387 - accuracy: 0.8521 - val_loss: 1.0318 - val_accuracy: 0.6717
# Use a GRU model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
output_dim=128,
embeddings_initializer="uniform",
input_length=max_length,
name="embedding_3")
# Build an RNN using the GRU cell
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
# x = layers.GRU(64, return_sequences=True)(x) # stacking recurrent cells requires return_sequences=True
x = layers.GRU(64)(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer after GRU cell
outputs = layers.Dense(5, activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")
model_3.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.keras.optimizers.Adam(),
metrics=["accuracy"])
# Fit model
history_3 = model_3.fit(train_data_x, # input sentences can be a list of strings due to text preprocessing layer built-in model
train_data_y,
epochs=5,
validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 30s 25ms/step - loss: 1.1551 - accuracy: 0.5164 - val_loss: 0.8801 - val_accuracy: 0.6639
Epoch 2/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.7519 - accuracy: 0.7256 - val_loss: 0.8142 - val_accuracy: 0.6995
Epoch 3/5
1124/1124 [==============================] - 27s 24ms/step - loss: 0.6182 - accuracy: 0.7800 - val_loss: 0.8299 - val_accuracy: 0.6922
Epoch 4/5
1124/1124 [==============================] - 28s 25ms/step - loss: 0.4928 - accuracy: 0.8313 - val_loss: 0.9421 - val_accuracy: 0.6687
Epoch 5/5
1124/1124 [==============================] - 29s 26ms/step - loss: 0.3799 - accuracy: 0.8734 - val_loss: 1.0371 - val_accuracy: 0.6637
# Use a bidirectional RNN model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
output_dim=128,
embeddings_initializer="uniform",
input_length=max_length,
name="embedding_4")
# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(5, activation="softmax")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")
# Compile
model_4.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.keras.optimizers.Adam(),
metrics=["accuracy"])
# Fit the model (takes longer because of the bidirectional layers)
history_4 = model_4.fit(train_data_x, # input sentences can be a list of strings due to text preprocessing layer built-in model
train_data_y,
epochs=5,
validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 42s 34ms/step - loss: 1.0945 - accuracy: 0.5523 - val_loss: 0.8854 - val_accuracy: 0.6619
Epoch 2/5
1124/1124 [==============================] - 37s 33ms/step - loss: 0.7448 - accuracy: 0.7264 - val_loss: 0.8200 - val_accuracy: 0.6936
Epoch 3/5
1124/1124 [==============================] - 37s 33ms/step - loss: 0.6063 - accuracy: 0.7808 - val_loss: 0.8650 - val_accuracy: 0.6862
Epoch 4/5
1124/1124 [==============================] - 38s 33ms/step - loss: 0.4742 - accuracy: 0.8311 - val_loss: 0.9551 - val_accuracy: 0.6679
Epoch 5/5
1124/1124 [==============================] - 38s 34ms/step - loss: 0.3461 - accuracy: 0.8809 - val_loss: 1.1107 - val_accuracy: 0.6707
# Use a 1D convolutional (Conv1D) model
tf.random.set_seed(42)
from tensorflow.keras import layers
model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
output_dim=128,
embeddings_initializer="uniform",
input_length=max_length,
name="embedding_5")
# Create 1-dimensional convolutional layer to model sequences
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer
outputs = layers.Dense(5, activation="softmax")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")
# Compile Conv1D model
model_5.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.keras.optimizers.Adam(),
metrics=["accuracy"])
history_5 = model_5.fit(train_data_x, # input sentences can be a list of strings due to text preprocessing layer built-in model
train_data_y,
epochs=5,
validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 16s 14ms/step - loss: 1.1869 - accuracy: 0.5063 - val_loss: 1.0052 - val_accuracy: 0.6145
Epoch 2/5
1124/1124 [==============================] - 15s 13ms/step - loss: 0.8448 - accuracy: 0.6870 - val_loss: 0.9755 - val_accuracy: 0.6220
Epoch 3/5
1124/1124 [==============================] - 15s 14ms/step - loss: 0.6060 - accuracy: 0.7933 - val_loss: 1.0282 - val_accuracy: 0.6161
Epoch 4/5
1124/1124 [==============================] - 15s 14ms/step - loss: 0.3631 - accuracy: 0.8916 - val_loss: 1.1664 - val_accuracy: 0.6012
Epoch 5/5
1124/1124 [==============================] - 16s 15ms/step - loss: 0.1853 - accuracy: 0.9544 - val_loss: 1.3613 - val_accuracy: 0.5854
# Use a transfer learning model with a pretrained sentence encoder (Universal Sentence Encoder)
import tensorflow_hub as hub
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
input_shape=[], # shape of inputs coming to our model
dtype=tf.string, # data type of inputs coming to the USE layer
trainable=False, # keep the pretrained weights (we'll create a feature extractor)
name="USE")
# Create model using the Sequential API
model_6 = tf.keras.Sequential([
sentence_encoder_layer, # take in sentences and then encode them into an embedding
layers.Dense(64, activation="relu"),
layers.Dense(5, activation="softmax")
], name="model_6_USE")
# Compile model
model_6.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.keras.optimizers.Adam(),
metrics=["accuracy"])
# Train a classifier on top of pretrained embeddings
history_6 = model_6.fit(train_data_x, # input sentences can be a list of strings due to text preprocessing layer built-in model
train_data_y,
epochs=5,
validation_data=(test_data_x, test_data_y))
Epoch 1/5
1124/1124 [==============================] - 12s 9ms/step - loss: 1.2914 - accuracy: 0.4389 - val_loss: 1.2366 - val_accuracy: 0.4670
Epoch 2/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.2098 - accuracy: 0.4787 - val_loss: 1.2213 - val_accuracy: 0.4750
Epoch 3/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.1948 - accuracy: 0.4882 - val_loss: 1.2169 - val_accuracy: 0.4791
Epoch 4/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.1810 - accuracy: 0.4942 - val_loss: 1.2119 - val_accuracy: 0.4750
Epoch 5/5
1124/1124 [==============================] - 9s 8ms/step - loss: 1.1674 - accuracy: 0.5023 - val_loss: 1.2093 - val_accuracy: 0.4843
model_0_results = calculate_results(y_true=test_data_y, y_pred=model_0.predict(test_data_x))
model_1_results = calculate_results(y_true=test_data_y, y_pred=model_1.predict(test_data_x).argmax(axis=1))
model_2_results = calculate_results(y_true=test_data_y, y_pred=model_2.predict(test_data_x).argmax(axis=1))
model_3_results = calculate_results(y_true=test_data_y, y_pred=model_3.predict(test_data_x).argmax(axis=1))
model_4_results = calculate_results(y_true=test_data_y, y_pred=model_4.predict(test_data_x).argmax(axis=1))
model_5_results = calculate_results(y_true=test_data_y, y_pred=model_5.predict(test_data_x).argmax(axis=1))
model_6_results = calculate_results(y_true=test_data_y, y_pred=model_6.predict(test_data_x).argmax(axis=1))
ensemble_results = calculate_results(y_true=test_data_y, y_pred=((model_2.predict(test_data_x) + model_3.predict(test_data_x) + model_4.predict(test_data_x))/3).argmax(axis=1))
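# A small sketch of the ensemble step as a reusable helper (assumed naming, not in the original):
# average the softmax probabilities of several models and take the argmax over the five classes.
def average_softmax_ensemble(models, samples):
    """Average predicted class probabilities across models and return class labels."""
    probs = [model.predict(samples) for model in models]
    return (sum(probs) / len(probs)).argmax(axis=1)
# e.g. equivalent to the ensemble above:
# ensemble_preds = average_softmax_ensemble([model_2, model_3, model_4], test_data_x)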
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"naive bayes": model_0_results,
"simple_dense": model_1_results,
"lstm": model_2_results,
"gru": model_3_results,
"bidirectional": model_4_results,
"conv1d": model_5_results,
"tf_hub_sentence_encoder": model_6_results,
"ensemble_results":ensemble_results})
all_model_results = all_model_results.transpose()
# Reduce the accuracy to same scale as other metrics
all_model_results["accuracy"] = all_model_results["accuracy"]/100
all_model_results
|                         | accuracy | precision | recall   | f1       |
|-------------------------|----------|-----------|----------|----------|
| naive bayes             | 0.355022 | 0.613521  | 0.355022 | 0.247223 |
| simple_dense            | 0.615060 | 0.619968  | 0.615060 | 0.614946 |
| lstm                    | 0.671672 | 0.672680  | 0.671672 | 0.671852 |
| gru                     | 0.663664 | 0.666331  | 0.663664 | 0.664719 |
| bidirectional           | 0.670671 | 0.674650  | 0.670671 | 0.671100 |
| conv1d                  | 0.585363 | 0.584906  | 0.585363 | 0.584887 |
| tf_hub_sentence_encoder | 0.484262 | 0.488389  | 0.484262 | 0.484484 |
| ensemble_results        | 0.688466 | 0.690916  | 0.688466 | 0.689124 |
# Plot and compare all of the model results
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0));
# Sort model results by f1-score
all_model_results.sort_values("f1", ascending=False)["f1"].plot(kind="bar", figsize=(10, 7));
# Get the weights of the embedding layer from the LSTM model
model_2.summary()
Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                      Output Shape         Param #
=================================================================
 input_6 (InputLayer)              [(None, 1)]          0
 text_vectorization_1              (None, 31)           0
 (TextVectorization)
 embedding_2 (Embedding)           (None, 31, 128)      1280000
 lstm_1 (LSTM)                     (None, 64)           49408
 dense_5 (Dense)                   (None, 5)            325
=================================================================
Total params: 1,329,733
Trainable params: 1,329,733
Non-trainable params: 0
_________________________________________________________________
# Save weights and vocab of LSTM model for tensorflow projector
import io
weights = model_2.get_layer('embedding_2').get_weights()[0]
vocab = text_vectorizer.get_vocabulary()
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')
for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()
# Calculate the time of predictions
import time
def pred_timer(model, samples):
    """
    Times how long a model takes to make predictions on samples.
    Args:
    ----
    model = a trained model
    samples = a list of samples
    Returns:
    ----
    total_time = total elapsed time for model to make predictions on samples
    time_per_pred = time in seconds per single sample
    """
    start_time = time.perf_counter()  # get start time
    model.predict(samples)  # make predictions
    end_time = time.perf_counter()  # get finish time
    total_time = end_time - start_time  # calculate how long predictions took to make
    time_per_pred = total_time / len(samples)  # find prediction time per sample
    return total_time, time_per_pred
# Calculate prediction times for all models
model_0_total_pred_time, model_0_time_per_pred = pred_timer(model_0, test_data_x)
model_1_total_pred_time, model_1_time_per_pred = pred_timer(model_1, test_data_x)
model_2_total_pred_time, model_2_time_per_pred = pred_timer(model_2, test_data_x)
model_3_total_pred_time, model_3_time_per_pred = pred_timer(model_3, test_data_x)
model_4_total_pred_time, model_4_time_per_pred = pred_timer(model_4, test_data_x)
model_5_total_pred_time, model_5_time_per_pred = pred_timer(model_5, test_data_x)
model_6_total_pred_time, model_6_time_per_pred = pred_timer(model_6, test_data_x)
# Make Scatter plot
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 7))
plt.scatter(model_0_time_per_pred, model_0_results["f1"], label="naive bayes")
plt.scatter(model_1_time_per_pred, model_1_results["f1"], label="simple_dense")
plt.scatter(model_2_time_per_pred, model_2_results["f1"], label="lstm")
plt.scatter(model_3_time_per_pred, model_3_results["f1"], label="gru")
plt.scatter(model_4_time_per_pred, model_4_results["f1"], label="bidirectional")
plt.scatter(model_5_time_per_pred, model_5_results["f1"], label="conv1d")
plt.scatter(model_6_time_per_pred, model_6_results["f1"], label="tf_hub_sentence_encoder")
plt.legend()
plt.title("F1-score versus time per prediction")
plt.xlabel("Time per prediction")
plt.ylabel("F1-Score");
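# Optional sketch (assumed naming, not in the original): annotate each point with its model name
# so the speed/score tradeoff can be read directly off the plot without the legend.
time_f1_pairs = {"naive bayes": (model_0_time_per_pred, model_0_results["f1"]),
                 "simple_dense": (model_1_time_per_pred, model_1_results["f1"]),
                 "lstm": (model_2_time_per_pred, model_2_results["f1"]),
                 "gru": (model_3_time_per_pred, model_3_results["f1"]),
                 "bidirectional": (model_4_time_per_pred, model_4_results["f1"]),
                 "conv1d": (model_5_time_per_pred, model_5_results["f1"]),
                 "tf_hub_sentence_encoder": (model_6_time_per_pred, model_6_results["f1"])}
plt.figure(figsize=(10, 7))
for name, (pred_time, f1) in time_f1_pairs.items():
    plt.scatter(pred_time, f1)
    plt.annotate(name, (pred_time, f1))
plt.title("F1-score versus time per prediction (annotated)")
plt.xlabel("Time per prediction")
plt.ylabel("F1-Score");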